from scipy.sparse import load_npz
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error


def model_rep(y_true, y_pred, average="micro"):
    """Return (precision, recall, f1) for predictions vs. ground truth.

    ``average`` is forwarded to the sklearn metric functions
    (default ``"micro"``).
    """
    return (
        precision_score(y_true, y_pred, average=average),
        recall_score(y_true, y_pred, average=average),
        f1_score(y_true, y_pred, average=average),
    )


def eu_distances(array1, array2):
    """Mean squared element-wise difference between two equal-length rows.

    NOTE(review): despite the name, this is not a Euclidean distance —
    there is no square root, and the sum is divided by the length, so it
    is a per-row MSE. Callers (``average_distance``) rely on this exact
    value; confirm the naming is just historical before changing it.
    """
    squared_diffs = ((a - b) ** 2 for a, b in zip(array1, array2))
    return sum(squared_diffs) / len(array1)


def average_distance(y_true_prod, y_pred_prod):
    """Average per-row mean-squared-difference between two row batches.

    Equivalent to averaging ``eu_distances`` over paired rows of the
    one-hot truth matrix and the predicted-probability matrix.
    """
    per_row = [
        sum((a - b) ** 2 for a, b in zip(true_row, pred_row)) / len(true_row)
        for true_row, pred_row in zip(y_true_prod, y_pred_prod)
    ]
    return sum(per_row) / len(y_true_prod)


def sparse_list(array_get):
    """Return ``(indices, values)`` for the non-zero entries of a dense row."""
    nonzero = [(i, v) for i, v in enumerate(array_get) if v != 0]
    col = [i for i, _ in nonzero]
    data = [v for _, v in nonzero]
    return col, data


def parse_sparse(array1):
    """Parse ``"col,value"`` strings into parallel integer lists.

    Each element of ``array1`` is split on the first comma; both fields
    are parsed through ``float`` (so ``"3.0"`` is accepted) and then
    truncated to ``int``.
    """
    col = []
    data = []
    for entry in array1:
        fields = entry.split(",")
        col.append(int(float(fields[0])))
        data.append(int(float(fields[1])))
    return col, data


def load_dataset(x_path, y_path):
    """Load one data split (features + labels).

    Fixes over the previous version: the label comprehension reused the
    name ``idx`` that also served as a (dead) line counter, shadowing it;
    the unused counter and commented-out debug code are removed.

    Parameters
    ----------
    x_path : str
        Path to a scipy ``.npz`` sparse feature matrix.
    y_path : str
        Path to a CSV file whose first five columns are a one-hot rating
        encoding (ratings 1..5).

    Returns
    -------
    tuple
        ``(x, y, y_oh)`` — the sparse matrix, the list of integer labels
        in ``1..5``, and the raw one-hot rows as lists of ints.
    """
    y_ = []
    y_oh = []
    with open(y_path, encoding="utf8") as f:
        for line in f:
            cells = line.strip().split(",")
            one_hot = [int(float(cell)) for cell in cells[:5]]
            # max(k * one_hot[k]) is the index of the hot bit; +1 turns it
            # into the 1-based rating. An all-zero row maps to label 1,
            # matching the original behaviour.
            y_.append(max(k * one_hot[k] for k in range(5)) + 1)
            y_oh.append(one_hot)
    x_ = load_npz(x_path)
    return x_, y_, y_oh


def exp_score(y_pred_prod, classes_get):
    """Expected score for one sample: dot product of its predicted class
    probabilities with the corresponding class values."""
    total = 0.0
    for prob, cls in zip(y_pred_prod, classes_get):
        total += prob * cls
    return total


def exp_score_batch(y_pred_prod, classes_get):
    """Expected score (probability-weighted class value) for every row
    of a batch of predicted-probability vectors."""
    scores = []
    for row in y_pred_prod:
        acc = 0.0
        for prob, cls in zip(row, classes_get):
            acc += prob * cls
        scores.append(acc)
    return scores


# Paths to the pre-generated MovieLens-1M feature/label files
# (relative to this script's location).
GEN_DATA_TRAIN_PATH = "../../data/ml-1m_20190508/TRAIN_2019050901_gen.csv"
GEN_DATA_TEST_PATH = "../../data/ml-1m_20190508/TEST_2019050901_gen.csv"
# Sparse feature matrices (.npz) and one-hot label CSVs per split.
GEN_DATA_TRAIN_X_PATH = "../../data/ml-1m_20190508/TRAIN_X_gen_2019050901.npz"
GEN_DATA_TRAIN_Y_PATH = "../../data/ml-1m_20190508/TRAIN_Y_gen_2019050901.csv"
GEN_DATA_TEST_X_PATH = "../../data/ml-1m_20190508/TEST_X_gen_2019050901.npz"
GEN_DATA_TEST_Y_PATH = "../../data/ml-1m_20190508/TEST_Y_gen_2019050901.csv"
# Metric-report template: filled once per evaluated split ("train"/"test").
PRINT_RESULT = "%s result get:\nprecision: %s,\nrecall: %s,\nf1: %s,\nMAE: %s,\nMSE: %s,\naverage distance: %s, \nexp_MAE: %s, \nexp_MSE: %s\n---------------\n"

def _evaluate(clf, x, y_true, y_true_oh, tag):
    """Evaluate *clf* on one split and print the metric report.

    Computes classification metrics (precision/recall/F1 via
    ``model_rep``), MAE/MSE of the predicted labels, the average
    probability-vs-one-hot distance, and MAE/MSE between the expected
    score and the predicted label, then prints them via PRINT_RESULT.
    """
    y_pred = clf.predict(x)
    y_prod = clf.predict_proba(x)
    p, r, f1score = model_rep(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    ad = average_distance(y_true_oh, y_prod)
    exp_sc = exp_score_batch(y_prod, clf.classes_)
    # NOTE(review): expected-score error is measured against the hard
    # prediction, not the ground truth — confirm that is intended.
    exp_mae = mean_absolute_error(exp_sc, y_pred)
    exp_mse = mean_squared_error(exp_sc, y_pred)
    print(PRINT_RESULT % (tag, p, r, f1score, mae, mse, ad, exp_mae, exp_mse))


# Load training data.
x_train, y_train, y_train_oh = load_dataset(GEN_DATA_TRAIN_X_PATH, GEN_DATA_TRAIN_Y_PATH)
print("read train data_done")

# Train the classifier.
# BUG FIX: the default solver (lbfgs since sklearn 0.22) does not support
# penalty="l1" and raises ValueError; liblinear does. liblinear ignores
# n_jobs, so the previous n_jobs=-1 had no effect and is dropped.
print("start training")
clf = LogisticRegression(penalty="l1", solver="liblinear")
clf.fit(x_train, y_train)
print("train done")

# Evaluate on the training split.
_evaluate(clf, x_train, y_train, y_train_oh, "train")

# Evaluate on the held-out test split.
x_test, y_test, y_test_oh = load_dataset(GEN_DATA_TEST_X_PATH, GEN_DATA_TEST_Y_PATH)
_evaluate(clf, x_test, y_test, y_test_oh, "test")
# BUG FIX: the final message previously repeated "read train data_done".
print("test done")
